Global Analysis of Education and Occupation

By Tyler Gorecki, Ricky Kuehn, Doruk Ozar, Luke Schneider, James Siegener

Import, Cleaning

Libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggcorrplot)
library(grid)
library(vcd)
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-8
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(pls)
## 
## Attaching package: 'pls'
## 
## The following object is masked from 'package:caret':
## 
##     R2
## 
## The following object is masked from 'package:stats':
## 
##     loadings
library(PRROC)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

Load data

df = expectancy = read.csv('~/Downloads/linear_models_project_data.csv')
head(df)
##   X age        workclass fnlwgt education education.num     marital.status
## 1 0  39        State-gov  77516 Bachelors            13      Never-married
## 2 1  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3 2  38          Private 215646   HS-grad             9           Divorced
## 4 3  53          Private 234721      11th             7 Married-civ-spouse
## 5 4  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6 5  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital.gain capital.loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hours.per.week native.country income
## 1             40  United-States  <=50K
## 2             13  United-States  <=50K
## 3             40  United-States  <=50K
## 4             40  United-States  <=50K
## 5             40           Cuba  <=50K
## 6             40  United-States  <=50K

Drop missing values

df = replace(df, df=="NaN", NA)
df = df %>% drop_na()
df %>% head()
##   X age        workclass fnlwgt education education.num     marital.status
## 1 0  39        State-gov  77516 Bachelors            13      Never-married
## 2 1  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3 2  38          Private 215646   HS-grad             9           Divorced
## 4 3  53          Private 234721      11th             7 Married-civ-spouse
## 5 4  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6 5  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital.gain capital.loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hours.per.week native.country income
## 1             40  United-States  <=50K
## 2             13  United-States  <=50K
## 3             40  United-States  <=50K
## 4             40  United-States  <=50K
## 5             40           Cuba  <=50K
## 6             40  United-States  <=50K

Rename ‘income’ values

df = replace(df, df=="<=50K.", "<=50K")
df = replace(df, df==">50K.", ">50K")

df %>% head()
##   X age        workclass fnlwgt education education.num     marital.status
## 1 0  39        State-gov  77516 Bachelors            13      Never-married
## 2 1  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3 2  38          Private 215646   HS-grad             9           Divorced
## 4 3  53          Private 234721      11th             7 Married-civ-spouse
## 5 4  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6 5  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital.gain capital.loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hours.per.week native.country income
## 1             40  United-States  <=50K
## 2             13  United-States  <=50K
## 3             40  United-States  <=50K
## 4             40  United-States  <=50K
## 5             40           Cuba  <=50K
## 6             40  United-States  <=50K

Collapse education into bins

df$education = replace(df$education, df$education=="1st-4th", "Elementary/Middle")
df$education = replace(df$education, df$education=="5th-6th", "Elementary/Middle")
df$education = replace(df$education, df$education=="7th-8th", "Elementary/Middle")
df$education = replace(df$education, df$education=="Preschool", "Elementary/Middle")
df$education = replace(df$education, df$education=="Assoc-acdm", "Associates")
df$education = replace(df$education, df$education=="Assoc-voc", "Associates")
df$education = replace(df$education, df$education=="9th", "Highschool")
df$education = replace(df$education, df$education=="10th", "Highschool")
df$education = replace(df$education, df$education=="11th", "Highschool")
df$education = replace(df$education, df$education=="12th", "Highschool")
df$education = replace(df$education, df$education=="HS-grad", "Highschool")

head(df)
##   X age        workclass fnlwgt  education education.num     marital.status
## 1 0  39        State-gov  77516  Bachelors            13      Never-married
## 2 1  50 Self-emp-not-inc  83311  Bachelors            13 Married-civ-spouse
## 3 2  38          Private 215646 Highschool             9           Divorced
## 4 3  53          Private 234721 Highschool             7 Married-civ-spouse
## 5 4  28          Private 338409  Bachelors            13 Married-civ-spouse
## 6 5  37          Private 284582    Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital.gain capital.loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hours.per.week native.country income
## 1             40  United-States  <=50K
## 2             13  United-States  <=50K
## 3             40  United-States  <=50K
## 4             40  United-States  <=50K
## 5             40           Cuba  <=50K
## 6             40  United-States  <=50K

EDA and Research Questions

How does the distribution of income vary across different age groups?

df$age_group <- cut(df$age, 
                      breaks = c(0, 20, 30, 40, 50, 60, 70, 80, Inf), 
                      labels = c("0-20", "21-30", "31-40", "41-50", "51-60", "61-70", "71-80", "81+"), 
                      right = FALSE)

df %>%
  group_by(income, age_group) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>% 
  ggplot(aes(x = age_group, y = proportion, fill = income)) + 
  geom_bar(stat = 'identity', position = 'dodge')+theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  ggtitle("Proportion of Income by Age Group") + xlab("Age Group") + ylab("Proportion") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.

What is the relationship between the number of hours worked per week and income level?

df2 <- df
names(df2)[names(df2) == 'income'] <- 'Income'
df2 %>% ggplot(aes(x=hours.per.week, y=Income, color=Income, fill=Income))+geom_boxplot() + ggtitle("Hours worked each week per Income Level") + xlab("Hours worked per week") + ylab("Income Level") + theme(plot.title = element_text(hjust = 0.5))

What is the relationship between the number of hours worked per week and income level?

df %>% ggplot(aes(x=hours.per.week, y=income, color=income))+geom_violin()

Converting income column to dummy variables, find correlation between education number and income greater than 50K

data_dummies = fastDummies::dummy_cols(df, select_columns="income")
cor(data_dummies$education.num, data_dummies$`income_>50K`)
## [1] 0.3327999

Income level depending on education level

df$education = factor(df$education, levels = c("Elementary/Middle", "Highschool", 'Associates', "Some-college", 'Bachelors','Prof-school', 'Masters', 'Doctorate'))

df %>%
  group_by(income, education) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>% 
  ggplot(aes(x = education, y = proportion, fill = income)) + 
  geom_bar(stat = 'identity', position = 'dodge')+theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  ggtitle("Proportion of Income by Education Level") + xlab("Education Level") + ylab("Proportion") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.

Interpretation: As we can see from the above graph, the most amount of people who earns greater than 50K is people who have bachelors degree and this is because in our data we have more people who have bachelors degree than people who have Masters or Doctorate degree. Same idea applies to Highschool column, because we have more data on people whose highest level of education is highschool, that is why it looks people who earns less than 50K is in highschool column

Income level depending on occupation

df %>%
  group_by(income, occupation) %>%
  summarise(count = n()) %>%
  mutate(proportion = count / sum(count)) %>%
  ungroup() %>% 
  ggplot(aes(x = occupation, y = proportion, fill = income)) + 
  geom_bar(stat = 'identity', position = 'dodge')+theme(axis.text.x = element_text(angle = 45, hjust = 1)) + ggtitle("Income Level proportion per Occupation") + xlab("Occupation") + ylab("Proportion") + theme(plot.title = element_text(hjust = 0.5)) +
  labs(fill = "Income")
## `summarise()` has grouped output by 'income'. You can override using the
## `.groups` argument.

Interpretation: As we can see from the bar graph above, we can say that the most amount of people who earns above 50K is people who has executive/managerial positions. After that it comes people who has a position in Prof/specialty. These makes sense because in real life as well, Executive managers earn a lot of money and Professors who make a lot of research can earn a lot of money. On the other hand, the most amount of people who make less than 50K is people who work in clerical jobs. These might include data entring, answering phone calls, filling paperwork and etc.

Correlation Matrix between all the numerical variables.

cor_mat <- cor(df[,c('age',"education.num", 'capital.gain','capital.loss','hours.per.week')])
ggcorrplot(cor_mat,lab=TRUE, type='full')

Interpretation: As you can see in the correlation matrix, multicollinearity is not an issue becase there are no variables that are highly correlated with each other

What is the relationship between mean hours.per.week and age?

mean_df <- df %>%
  group_by(age) %>%
  summarise(mean_hours_per_week = mean(hours.per.week))

ggplot(mean_df, aes(x = mean_hours_per_week, y = age)) +
  geom_point(color = "red") +
  labs(title = "Scatterplot of Mean Hours Per Week vs. Age",
       x = "Mean Hours Per Week",
       y = "Age") +
  theme_minimal()

What is the relationship between marital status and age?

ggplot(df, aes(x = marital.status, y = age, fill = marital.status)) +
  geom_boxplot() +
  labs(title = "Boxplot of Age by Marital Status",
       x = "Marital Status",
       y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), plot.title = element_text(hjust = 0.5))

Examine distribution of capital loss and capital gain

value_counts <- table(df$capital.loss)
df_capital.loss_count <- data.frame(value_counts)
df_capital.loss_count$pct <- round(100*df_capital.loss_count$Freq / sum(df_capital.loss_count$Freq),2)
head(df_capital.loss_count)
##   Var1  Freq   pct
## 1    0 43082 95.27
## 2  155     1  0.00
## 3  213     5  0.01
## 4  323     5  0.01
## 5  419     1  0.00
## 6  625    17  0.04
value_counts2 <- table(df$capital.gain)
df_capital.gain_count <- data.frame(value_counts2)
df_capital.gain_count$pct <- round(100*df_capital.gain_count$Freq / sum(df_capital.gain_count$Freq),2)
head(df_capital.gain_count)
##   Var1  Freq   pct
## 1    0 41432 91.62
## 2  114     8  0.02
## 3  401     2  0.00
## 4  594    42  0.09
## 5  914    10  0.02
## 6  991     4  0.01

Interpretation: As we can see from the above tables, most of the people in the dataset has 0 capital loss and capital gain. This is because in real life, most of the people do not have capital loss or gain.

Association between education level and income

contingency_table = table(df$education, df$income)
chisq.test(contingency_table)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 5814.2, df = 7, p-value < 2.2e-16

Interpretation: As we can see from the Chi-squared test, education and income have a very high association because the chi-square value is extremely high and p value is really low. This means that we have to reject null hypothesis and conclude that there is a high association between education and income.

Association between age group and income

contingency_table = table(df$age_group, df$income)
chisq.test(contingency_table)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 4234.6, df = 7, p-value < 2.2e-16

Interpretation: As we can see from the Chi-squared test, age groups and income have a very high association because the chi-square value is extremely high and p value is really low. This means that we have to reject null hypothesis and conclude that there is a high association between age groups and income.

Logistic Regression with Lasso and Cross Validation

Train Test split by 80% training and 20% testing datasets

X = model.matrix(income~0+., data=df)
X = scale(X)
y= df$income

train_index = sample(1:nrow(df), 0.8 * nrow(df))

train_X = X[train_index, ]
train_y = y[train_index]

test_X = X[-train_index, ]
test_y = y[-train_index]

10 fold cross validation and finding best lambda for ‘income’ categorical variable

set.seed(123)
cv_model = cv.glmnet(x=train_X, y=train_y, alpha=1, family="binomial", type.measure = "class", nfolds = 10)

best_lambda <- cv_model$lambda.min
best_lambda
## [1] 0.0001231903

Plot cross validation model

plot(cv_model)

Store names of predictor variables

coefficient_matrix = as.matrix(coef(cv_model))

non_zero_indices <- which(coefficient_matrix != 0)
non_zero_coefs <- coefficient_matrix[non_zero_indices, ]

names_of_predictors = rownames(coefficient_matrix)[non_zero_indices]
#cbind(rownames(coefficient_matrix)[non_zero_indices], coefficient_matrix[coefficient_matrix!=0])

Finalize logistic model using the ideal lambda value

final_model <- glmnet(train_X, train_y, alpha = 1, family = "binomial", lambda = best_lambda)

Cast test_y data as 1s and 0s

#cbind(predicted_classes, test_y)
test_y = ifelse(test_y == ">50K", 1, 0)

Make predictions, construct confusion matrix where 0 represents individuals who earn <= 50K and 1 represents individuals who earn > 50K

predictions <- predict(final_model, newx = test_X, type = "response")
#predicted_classes <- ifelse(predictions > 0.5, ">50K", "<=50K")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

confusion <- confusionMatrix(factor(predicted_classes), factor(test_y))
confusion_df <- data.frame(confusion$table)
confusion_df$Color <- with(confusion_df, ifelse(Prediction == Reference, "green", "red"))

ggplot(data = confusion_df, aes(x = factor(Prediction), y = factor(Reference), fill = Color)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), color = "black", vjust = 1, size = 10) +
  scale_fill_manual(values = c("green" = "green", "red" = "red")) +
  theme_minimal() +
  labs(x = "Predicted", y = "Actual", fill = "Count") +
  ggtitle("Confusion Matrix Heatmap") +
  scale_y_discrete(limits = rev(levels(factor(confusion_df$Reference)))) + 
  theme(plot.title = element_text(hjust = 0.5))

Find R squared value of logistic model

print(paste("R squared value:", round(caret::R2(predicted_classes, test_y), 4)))
## [1] "R squared value: 0.3265"

Precision, Recall, F1

tn = confusion$table[1, 1]
tp = confusion$table[2, 2]
fp = confusion$table[2, 1]
fn = confusion$table[1, 2]

precision <- tp / (tp + fp)
recall <- tp / (tp + fn)

# Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
print(paste("Precision:", round(precision,3)))
## [1] "Precision: 0.733"
print(paste("Recall:   ", round(recall, 3)))
## [1] "Recall:    0.607"
print(paste("F1 Score: ", round(f1_score,3)))
## [1] "F1 Score:  0.664"

Calculate accuracy

accuracy <- mean(predicted_classes == test_y)
print(paste("Testing Accuracy:", round(100*accuracy, 3)))
## [1] "Testing Accuracy: 84.809"

Plot ROC curve figure 1

PRROC_obj <- roc.curve(scores.class0 = predicted_classes, weights.class0=test_y, curve=TRUE)
plot(PRROC_obj)

Plot ROC curve figure 2

roc = roc(test_y, predicted_classes)
## Setting levels: control = 0, case = 1
## Warning in roc.default(test_y, predicted_classes): Deprecated use a matrix as
## predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
roc_dat = data.frame(TPR=roc$sensitivities, FPR=1-roc$specificities)
ggplot(roc_dat, aes(x=FPR, y=TPR))+geom_line()

Linear Regression with Lasso and Cross Validation

Train Test split by 80% training and 20% testing datasets

df3 <- df
df3$capital.gain <- sqrt(df$capital.gain)
df3$capital.loss <- sqrt(df$capital.loss)
# df3['over50'] <- df$age > 50 
X = model.matrix(age~0+.-age_group, data=df3)
X = scale(X)
y= df3$age

train_index = sample(1:nrow(df3), 0.8 * nrow(df3))

train_X = X[train_index, ]
train_y = y[train_index]

test_X = X[-train_index, ]
test_y = y[-train_index]
ggplot(data = df3, aes(x = df3$hours.per.week, y = df3$age)) + geom_point()
## Warning: Use of `df3$hours.per.week` is discouraged.
## ℹ Use `hours.per.week` instead.
## Warning: Use of `df3$age` is discouraged.
## ℹ Use `age` instead.

10 fold cross validation and finding best lambda for ‘age’ numeric variable

set.seed(123)
cv_model = cv.glmnet(x=train_X, y=train_y, alpha=1, nfolds = 10)

best_lambda <- cv_model$lambda.1se
best_lambda
## [1] 0.08800013

Plot cross validation model

plot(cv_model)

Store names of predictor variables

coefficient_matrix = as.matrix(coef(cv_model))
non_zero_indices <- which(coefficient_matrix != 0)


non_zero_coefs <- coefficient_matrix[non_zero_indices, ]


names_of_predictors = rownames(coefficient_matrix)[non_zero_indices]
names_of_predictors
##  [1] "(Intercept)"                         "X"                                  
##  [3] "workclassFederal-gov"                "workclassPrivate"                   
##  [5] "workclassSelf-emp-inc"               "workclassSelf-emp-not-inc"          
##  [7] "workclassState-gov"                  "workclassWithout-pay"               
##  [9] "fnlwgt"                              "educationHighschool"                
## [11] "educationAssociates"                 "educationSome-college"              
## [13] "educationProf-school"                "educationMasters"                   
## [15] "educationDoctorate"                  "education.num"                      
## [17] "marital.statusMarried-AF-spouse"     "marital.statusMarried-spouse-absent"
## [19] "marital.statusNever-married"         "marital.statusSeparated"            
## [21] "marital.statusWidowed"               "occupationArmed-Forces"             
## [23] "occupationCraft-repair"              "occupationExec-managerial"          
## [25] "occupationFarming-fishing"           "occupationHandlers-cleaners"        
## [27] "occupationMachine-op-inspct"         "occupationOther-service"            
## [29] "occupationPriv-house-serv"           "occupationProtective-serv"          
## [31] "occupationTech-support"              "occupationTransport-moving"         
## [33] "relationshipNot-in-family"           "relationshipOther-relative"         
## [35] "relationshipOwn-child"               "relationshipWife"                   
## [37] "raceBlack"                           "raceOther"                          
## [39] "capital.gain"                        "capital.loss"                       
## [41] "hours.per.week"                      "native.countryCanada"               
## [43] "native.countryCuba"                  "native.countryEl-Salvador"          
## [45] "native.countryGreece"                "native.countryGuatemala"            
## [47] "native.countryHong"                  "native.countryHungary"              
## [49] "native.countryIndia"                 "native.countryIran"                 
## [51] "native.countryItaly"                 "native.countryMexico"               
## [53] "native.countryPhilippines"           "native.countryPoland"               
## [55] "native.countryScotland"              "native.countryTaiwan"               
## [57] "income>50K"

Finalize linear model using the ideal lambda value

final_model <- glmnet(train_X, train_y, alpha = 1, lambda = best_lambda)

Plot cut-off point of Lasso Regression

plot(glmnet(train_X, train_y, alpha = 1), xvar = "lambda")+abline(v=log(best_lambda))

## integer(0)

Predict ‘age’ using test set

predictions <- predict(final_model, newx = test_X, type = "response")
residuals <- predictions - test_y

RMSE, MAPE, R2

print(paste("Root Mean Squared Error:", round(RMSE(predictions, test_y), 3)))
## [1] "Root Mean Squared Error: 10.296"
print(paste("Mean Absolute Error:", round(MAE(predictions, test_y), 3)))
## [1] "Mean Absolute Error: 8.093"
print(paste("Mean Squared Error:", round(mean((predictions - test_y)^2), 3)))
## [1] "Mean Squared Error: 106.017"
print(paste("Mean Absolute Percentage Error:", round(mean(abs((test_y-predictions)/test_y))*100,3)))
## [1] "Mean Absolute Percentage Error: 22.492"
print(paste("R squared:", round(caret::R2(predictions, test_y),5)))
## [1] "R squared: 0.39907"

*** Linearity Assumptions

predres <- data.frame(predictions = predictions, residuals = residuals)

ggplot(data = predres, aes(x=predictions, y=residuals)) + geom_point() + 
  geom_hline(yintercept = 0, color = 'red')

ggplot(predres, aes(sample=residuals))+stat_qq()+stat_qq_line(color='red')

PCA with categorical variables converted to factors

train_index = sample(1:nrow(df), 0.8 * nrow(df))

df2 <- df[,-c(17)]

train = df2[train_index,]
test = df2[train_index,]

pcr_model <- pcr(age~., data = train,scale = T)
summary(pcr_model)
## Data:    X dimension: 36177 89 
##  Y dimension: 36177 1
## Fit method: svdpc
## Number of components considered: 89
## TRAINING: % variance explained
##      1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X      4.181     7.17    9.989    12.15    14.09    15.99    17.70    19.37
## age   11.355    13.02   14.901    21.91    22.38    26.58    26.61    30.42
##      9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X      21.00     22.52     23.97     25.37     26.71     28.04     29.34
## age    30.49     30.56     30.58     30.95     31.07     31.17     31.78
##      16 comps  17 comps  18 comps  19 comps  20 comps  21 comps  22 comps
## X       30.63     31.91     33.18     34.43     35.67     36.90     38.10
## age     31.83     31.85     31.96     31.98     32.09     32.15     32.19
##      23 comps  24 comps  25 comps  26 comps  27 comps  28 comps  29 comps
## X       39.30     40.50     41.67     42.84     44.00     45.16     46.31
## age     32.22     32.26     32.44     33.83     33.84     34.06     34.06
##      30 comps  31 comps  32 comps  33 comps  34 comps  35 comps  36 comps
## X       47.46     48.59     49.73     50.86     51.99     53.12     54.24
## age     34.12     34.14     34.36     34.37     34.38     34.44     34.44
##      37 comps  38 comps  39 comps  40 comps  41 comps  42 comps  43 comps
## X       55.37     56.50     57.62     58.75     59.87     61.00     62.12
## age     34.45     34.45     34.46     34.46     34.46     34.46     34.46
##      44 comps  45 comps  46 comps  47 comps  48 comps  49 comps  50 comps
## X       63.25     64.37     65.50     66.62     67.74     68.87     69.99
## age     34.46     34.47     34.48     34.48     34.48     34.53     34.55
##      51 comps  52 comps  53 comps  54 comps  55 comps  56 comps  57 comps
## X       71.11     72.22     73.34     74.46     75.57     76.67     77.78
## age     34.56     34.63     34.64     34.68     34.68     34.80     35.21
##      58 comps  59 comps  60 comps  61 comps  62 comps  63 comps  64 comps
## X       78.87     79.96     81.05     82.14     83.22     84.29     85.35
## age     35.25     35.26     35.27     35.34     35.39     35.74     35.75
##      65 comps  66 comps  67 comps  68 comps  69 comps  70 comps  71 comps
## X       86.39     87.43     88.45     89.46     90.47     91.46     92.40
## age     35.80     35.85     35.86     35.90     36.19     36.19     37.39
##      72 comps  73 comps  74 comps  75 comps  76 comps  77 comps  78 comps
## X       93.33     94.21     95.07     95.87     96.63     97.37     97.97
## age     37.47     37.59     37.78     37.92     37.92     38.49     38.50
##      79 comps  80 comps  81 comps  82 comps  83 comps  84 comps  85 comps
## X       98.54     98.95     99.25     99.49     99.72     99.86     99.93
## age     38.56     39.17     39.17     39.57     40.81     40.95     40.95
##      86 comps  87 comps  88 comps  89 comps
## X       99.98     99.99    100.00    100.00
## age     40.98     41.17     41.28     41.28

RMSE

predictions <- predict(pcr_model, newdata = train, ncomp = 75)
residuals <- predictions - test$age
rmse <- sqrt(mean((test$age - predictions)^2))
rmse
## [1] 10.40209

*** Linearity Assumptions

predres <- data.frame(predictions = predictions, residuals = residuals)

ggplot(data = predres, aes(x=predictions, y=residuals)) + geom_point() + 
  geom_hline(yintercept = 0, color = 'red')

ggplot(predres, aes(sample=residuals))+stat_qq()+stat_qq_line(color='red')

Random Forest model

rf.fit <- randomForest(train_X[,-1], train_y, ntree = 500, mtry = 10)
rf.fit
## 
## Call:
##  randomForest(x = train_X[, -1], y = train_y, ntree = 500, mtry = 10) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 94.39412
##                     % Var explained: 45.84
predictions <- predict(rf.fit, test_X)
rmse <- sqrt(mean((predictions - test_y)^2))
print(paste("Root Mean Squared Error:", rmse))
## [1] "Root Mean Squared Error: 9.84802586715378"
varImpPlot(rf.fit)

results <- data.frame(Actual = test_y, Predicted = predictions)

# Plot predicted vs actual values using ggplot2
ggplot(results, aes(x=Predicted, y=Actual)) +
  geom_point() +
  geom_abline(slope=1, intercept=0, color="red") +
  labs(title="Actual vs Predicted Age", x="Predicted Age", y="Actual Age") +
  geom_smooth(method = 'lm', color = 'blue') + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Random forest model log transform response

library(randomForest)
train_y = log(train_y)
test_y = log(test_y)
rf.fit <- randomForest(train_X[,-1], train_y, ntree = 500, mtry = 10)
rf.fit
## 
## Call:
##  randomForest(x = train_X[, -1], y = train_y, ntree = 500, mtry = 10) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 10
## 
##           Mean of squared residuals: 0.05966574
##                     % Var explained: 51.34
predictions <- predict(rf.fit, test_X)
rmse <- sqrt(mean((predictions - test_y)^2))
rmse_orig <- sqrt(mean((exp(predictions) - exp(test_y))^2))
print(paste("Root Mean Squared Error:", rmse))
## [1] "Root Mean Squared Error: 0.247449244148933"
varImpPlot(rf.fit)

results <- data.frame(Actual = exp(test_y), Predicted = exp(predictions))

# Plot predicted vs actual values using ggplot2
ggplot(results, aes(x=Predicted, y=Actual)) +
  geom_point() +
  geom_abline(slope=1, intercept=0, color="red") +
  labs(title="Actual vs Predicted Age", x="Predicted Age", y="Actual Age") +
  geom_smooth(method = 'lm', color = 'blue') + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'